library(mosaic)
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attache Paket: 'mosaic'
## Die folgenden Objekte sind maskiert von 'package:dplyr':
##
## count, do, tally
## Das folgende Objekt ist maskiert 'package:Matrix':
##
## mean
## Das folgende Objekt ist maskiert 'package:ggplot2':
##
## stat
## Die folgenden Objekte sind maskiert von 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## Die folgenden Objekte sind maskiert von 'package:base':
##
## max, mean, min, prod, range, sample, sum
library(plotly)
##
## Attache Paket: 'plotly'
## Das folgende Objekt ist maskiert 'package:mosaic':
##
## do
## Das folgende Objekt ist maskiert 'package:ggplot2':
##
## last_plot
## Das folgende Objekt ist maskiert 'package:stats':
##
## filter
## Das folgende Objekt ist maskiert 'package:graphics':
##
## layout
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(dplyr)
library(rpart)
library(caret)
##
## Attache Paket: 'caret'
## Das folgende Objekt ist maskiert 'package:mosaic':
##
## dotPlot
library(psych)
##
## Attache Paket: 'psych'
## Die folgenden Objekte sind maskiert von 'package:mosaic':
##
## logit, rescale
## Die folgenden Objekte sind maskiert von 'package:ggplot2':
##
## %+%, alpha
library(ggplot2)
library(ggcorrplot)
library(rela)
# Clear all variables from the global environment.
# NOTE(review): rm(list = ls()) in a script is discouraged; prefer
# restarting the R session for a truly fresh state.
rm( list = ls() )
# Load the four raw recordings, one CSV per activity class.
# read.csv() already returns a data.frame; the data.frame() copies are kept
# only so the original variable names (read_* and *_data) stay available.
# (Assignments changed from `=` to the conventional `<-`.)
read_idle <- read.csv("01_Idle.csv")
idle_data <- data.frame(read_idle)
read_run <- read.csv("02_Running.csv")
run_data <- data.frame(read_run)
read_dab <- read.csv("03_Dab.csv")
dab_data <- data.frame(read_dab)
read_siu <- read.csv("04_Siu.csv")
siu_data <- data.frame(read_siu)
Rename ID correctly:
# Normalise the first column of every recording to the name "ID"
# (colnames() and names() are equivalent for data.frame columns).
colnames(idle_data)[1] <- "ID"
colnames(run_data)[1] <- "ID"
colnames(dab_data)[1] <- "ID"
colnames(siu_data)[1] <- "ID"
In total there are 8985 rows.
In the dab data, Orientation.X and Orientation.Z have the wrong data type.
We can't scale the data before converting these columns to numeric.
# Stack the four activity recordings into one merged data set.
idle_run <- rbind(idle_data, run_data)
irun_dab <- rbind(idle_run, dab_data)
motion_data <- rbind(irun_dab, siu_data)
# NOTE(review): this conversion runs AFTER idle_run was rbind-ed into
# motion_data above, so it does not affect motion_data at all; the real
# type fix happens later on motion_data_all (see the as.numeric calls there).
idle_run$Orientation.X <- as.numeric(idle_run$Orientation.X)
# NA counts per column: the three MagneticField columns carry 1499 NAs each.
colSums(is.na(idle_run))
## ID Author Category
## 0 0 0
## Sample Acceleration.Timestamp Acceleration.X
## 0 0 0
## Acceleration.Y Acceleration.Z AngularVelocity.X
## 0 0 0
## AngularVelocity.Y AngularVelocity.Z MagneticField.X
## 0 0 1499
## MagneticField.Y MagneticField.Z Orientation.X
## 1499 1499 0
## Orientation.Y Orientation.Z
## 0 0
motion_data_all <- data.frame(motion_data)
# Drop the magnetic-field columns: they contain ~1499 NAs each (see the
# colSums() output above). The three separate drop statements were
# consolidated into one vectorised removal.
magnetic_cols <- c("MagneticField.X", "MagneticField.Y", "MagneticField.Z")
motion_data_all <- motion_data_all[,!names(motion_data_all) %in% magnetic_cols]
# Convert columns to their correct types. Orientation.X and Orientation.Z
# came in with the wrong type from the Dab file, so they must be cast to
# numeric before the data can be scaled.
motion_data_all$Category <- as.factor(motion_data_all$Category)
motion_data_all$Acceleration.X <- as.numeric(motion_data_all$Acceleration.X)
motion_data_all$Orientation.X <- as.numeric(motion_data_all$Orientation.X)
motion_data_all$Orientation.Z <- as.numeric(motion_data_all$Orientation.Z)
More NA’s were found after the conversion (non-numeric entries were coerced to NA).
# Re-check NA counts: as.numeric() coercion of malformed entries creates new NAs.
colSums(is.na(motion_data_all))
## ID Author Category
## 0 0 0
## Sample Acceleration.Timestamp Acceleration.X
## 0 0 0
## Acceleration.Y Acceleration.Z AngularVelocity.X
## 0 0 1
## AngularVelocity.Y AngularVelocity.Z Orientation.X
## 1 1 0
## Orientation.Y Orientation.Z
## 0 0
Remove the NA’s
About 8584 rows left
# Drop the remaining rows containing any NA, then verify none are left.
motion_data_all <- na.omit(motion_data_all)
colSums(is.na(motion_data_all))
## ID Author Category
## 0 0 0
## Sample Acceleration.Timestamp Acceleration.X
## 0 0 0
## Acceleration.Y Acceleration.Z AngularVelocity.X
## 0 0 0
## AngularVelocity.Y AngularVelocity.Z Orientation.X
## 0 0 0
## Orientation.Y Orientation.Z
## 0 0
Scale the data:
# Separate the quantitative sensor columns (6:14 after the magnetic columns
# were dropped) from the Category label, standardise them, and recombine.
quant_var <- select(motion_data_all, c(6:14))
cat_var <- select(motion_data_all, c(3))
quant_var <- scale(quant_var)
motion_data_scale <- cbind(cat_var, quant_var)
motion_data_scale
motion_data_box <- select(motion_data_scale, c("Acceleration.X","Acceleration.Y","Acceleration.Z","AngularVelocity.X","AngularVelocity.Y","AngularVelocity.Z"))
# BUG FIX: boxplot() is base graphics, so adding the ggplot2 layer
# `+ scale_x_discrete(...)` evaluated to NULL and had no effect (the
# original run printed "NULL" here). Rotate the axis labels with the
# base-graphics parameter `las = 2` instead.
boxplot(motion_data_box, las = 2)
#geom_violin(trim = FALSE) +
#geom_boxplot()
#theme_minimal()
Train with data from Ahmed, Tobias, Saghar and Ronaldo
#motion_data_part <- subset(motion_data_all, Author == "Ahmed" | Author == "Tobias" | Author == "Saghar" | Author == "Ronaldo") #+ subset(motion_data_all, Author == "Tobias")
#motion_data_unknown <- subset(motion_data, Author == "Regan" | Author == "Darian") # 33 %
# Training pool: four authors; the fifth author (Saghar) is held out
# entirely as the unseen-person test set.
motion_data_part <- subset(motion_data_all, Author == "Ahmed" | Author == "Tobias"| Author == "Ronaldo"| Author == "Regan")
motion_data_test <- subset(motion_data_all, Author == "Saghar")
# Sanity check: no NAs should remain after the earlier na.omit().
colSums(is.na(motion_data_part))
## ID Author Category
## 0 0 0
## Sample Acceleration.Timestamp Acceleration.X
## 0 0 0
## Acceleration.Y Acceleration.Z AngularVelocity.X
## 0 0 0
## AngularVelocity.Y AngularVelocity.Z Orientation.X
## 0 0 0
## Orientation.Y Orientation.Z
## 0 0
# Keep a full copy (including ID/Author/Sample) for the descriptive
# statistics and the stacked bar chart below.
motion_data_all_stat <- data.frame(motion_data_all)
# Remove columns irrelevant for modelling (identifiers and timestamps).
motion_data_all <- motion_data_all[,!names(motion_data_all) %in% c("ID", "Acceleration.Timestamp", "Author", "Sample")]
Write merged cleaned data to file:
# Persist the merged, cleaned data set for reuse.
write.csv(motion_data_all, "All Samples Clean.csv", row.names = FALSE)
Stacked bar chart:
Seems like Darian and Ahmed have more compared to the others more motion data
# Count rows per author and activity for the stacked bar chart.
# .groups = "drop" returns an ungrouped result and silences the
# summarise() grouping message shown below; the plotted data is unchanged.
cat_count <- group_by(motion_data_all_stat, Author, Category) %>%
  summarize(count = n(), .groups = "drop")
## `summarise()` has grouped output by 'Author'. You can override using the
## `.groups` argument.
stack_bar <- ggplot(cat_count, aes(x = Author, y = count, fill = Category)) +
  geom_bar(stat = "identity") #+
#geom_text(aes(label = count), vjust = -4.5)
ggplotly(stack_bar)
Threshold: 0.2
Old one: Remaining features: Acceleration.X, Acceleration.Z, Orientation.X, Orientation.Y, Orientation.Z
New one: Remaining features: Acceleration.X, Acceleration.Y, Acceleration.Z, AngularVelocity.X, AngularVelocity.Y, AngularVelocity.Z
We remove the orientation, since everyone had a different phone position
# Work on a copy without the Category factor so cor() receives numeric
# columns only.
motion_data_all_numeric <- data.frame(motion_data_all)
motion_data_all_numeric <- motion_data_all_numeric[,!names(motion_data_all_numeric) %in% c("Category")]
# Was only for testing -> Darian: everyone holds the phone differently,
# which is why the orientation columns should be skipped.
#motion_data_all_numeric <- motion_data_all_numeric[,!names(motion_data_all_numeric) %in% c("Orientation.X", "Orientation.Y", "Orientation.Z")]
#motion_data_all_numeric$Category <- as.numeric(factor(motion_data_all_numeric$Category))
#motion_data_all_numeric$Category <- as.factor(motion_data_all_numeric$Category)
# Calculate the correlation matrix of the numeric feature columns
cor_matrix <- cor(motion_data_all_numeric)
# Visualize the correlation matrix (hierarchically ordered, lower triangle)
ggcorrplot(cor_matrix, hc.order = TRUE, type = "lower",
lab = TRUE, lab_size = 3, method = "circle")
We use only relevant columns for the model training
# Keep only the features used for model training: identifiers dropped, and
# orientation dropped because every author held the phone differently.
remove_col <- c("ID", "Acceleration.Timestamp", "Author", "Sample", "Orientation.X", "Orientation.Y", "Orientation.Z")
motion_data_part <- motion_data_part[,!names(motion_data_part) %in% remove_col]
plot_data <- data.frame(motion_data_test)
plot_data <- plot_data[,!names(plot_data) %in% remove_col]
motion_data_part_numeric <- data.frame(plot_data)
motion_data_part_numeric <- motion_data_part_numeric[,!names(motion_data_part_numeric) %in% c("Category")]
#idle_tobias <- subset(motion_data_tobias[1:5], Category == "Idle")
# Pair plot of the quantitative variables, coloured by activity.
# NOTE(review): colouring via plot_data$Category only works because
# plot_data and motion_data_part_numeric share identical row order.
# (Typo fixed in the plot title: "quantiative" -> "quantitative".)
ggpairs(data=motion_data_part_numeric,aes(color = plot_data$Category), title="Motion pair plot with quantitative variables",
upper = list(
continuous = wrap("cor", size = 0.75)
)
)
remove_col <- c("ID", "Author", "Sample", "Orientation.X", "Orientation.Y", "Orientation.Z")
# NOTE(review): this subsets the *raw* merged data (motion_data), not the
# cleaned motion_data_all, and despite the original variable name
# (idle_activity) it selects the "Running" activity — renamed accordingly.
running_activity = subset(motion_data, Category == "Running" & Author == "Tobias")
running_activity <- running_activity[,!names(running_activity) %in% remove_col]
#test <- scale_x_datetime(breaks = date_breaks("1 hours"), labels=date_format("%H:%m"), expand = c(0,0))
#test
# Acceleration traces over time for one author's running samples.
# The group_by() in the original had no effect on the ggplot output and
# was removed.
running_plot <- ggplot(running_activity, aes(x=Acceleration.Timestamp)) +
labs( x = "Timestamp", y = "Acceleration") +
geom_line(aes(y = Acceleration.X), color="dark green", alpha = 0.8) +
geom_line(aes(y = Acceleration.Y), color="light blue", alpha = 0.8) +
geom_line(aes(y = Acceleration.Z), color="dark orange", alpha = 0.8)
ggplotly(running_plot)
Train split: 80 %, Test split: 20 %
Since the features that we selected correlate good and are relevant, we skip the angular velocity
# Reproducible 80/20 split on the four training-pool authors.
set.seed(10)
# Take variables from correlation analysis
feature_selection <- motion_data_part#[,c("Category", "Acceleration.X", "Acceleration.Y", "Acceleration.Z")]
# createDataPartition() stratifies by Category so class proportions are kept.
train_index_all <- createDataPartition(feature_selection$Category, p =0.80, list = FALSE)
train_data_all<-feature_selection[train_index_all, ]
test_data_all<-feature_selection[-train_index_all, ]
# Random forest trained with 4-fold cross-validation on the training split.
set.seed(6)
# 6: 89.8 %
control_par <- trainControl(method = "cv", number=4)
model_rf_all <- train(Category~.,
data=train_data_all,
"rf",
trControl = control_par
)
model_rf_all
## Random Forest
##
## 4355 samples
## 6 predictor
## 4 classes: 'Dab', 'Idle', 'Running', 'Siu'
##
## No pre-processing
## Resampling: Cross-Validated (4 fold)
## Summary of sample sizes: 3267, 3267, 3265, 3266
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8378841 0.7835066
## 4 0.8300790 0.7730415
## 6 0.8234207 0.7641610
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
Random forest with cross validation 4 fold
# Cross-validated confusion matrix (percentual average cell counts per fold).
cm_train_data <- confusionMatrix(model_rf_all)
cm_train_data
## Cross-Validated (4 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction Dab Idle Running Siu
## Dab 18.0 0.8 1.0 2.0
## Idle 1.0 23.6 1.1 0.4
## Running 1.9 1.2 22.3 2.3
## Siu 1.9 0.7 1.8 19.9
##
## Accuracy (average) : 0.8379
set.seed(6)
## Generate predictions on the held-out 20 % test split
rf_all_pred_test <- predict(model_rf_all,test_data_all)
## Print the accuracy (share of correctly predicted rows)
accuracy <- mean(rf_all_pred_test == test_data_all$Category)*100
accuracy
## [1] 83.88582
# Detailed per-class statistics for the random forest on the test split.
cm_test_data <- confusionMatrix(rf_all_pred_test, test_data_all$Category)
cm_test_data
## Confusion Matrix and Statistics
##
## Reference
## Prediction Dab Idle Running Siu
## Dab 200 11 13 19
## Idle 11 254 12 8
## Running 14 16 245 27
## Siu 23 6 15 212
##
## Overall Statistics
##
## Accuracy : 0.8389
## 95% CI : (0.8156, 0.8602)
## No Information Rate : 0.2643
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7848
##
## Mcnemar's Test P-Value : 0.5823
##
## Statistics by Class:
##
## Class: Dab Class: Idle Class: Running Class: Siu
## Sensitivity 0.8065 0.8850 0.8596 0.7970
## Specificity 0.9487 0.9612 0.9288 0.9463
## Pos Pred Value 0.8230 0.8912 0.8113 0.8281
## Neg Pred Value 0.9431 0.9588 0.9490 0.9349
## Prevalence 0.2284 0.2643 0.2624 0.2449
## Detection Rate 0.1842 0.2339 0.2256 0.1952
## Detection Prevalence 0.2238 0.2624 0.2781 0.2357
## Balanced Accuracy 0.8776 0.9231 0.8942 0.8717
plt <- as.data.frame(cm_test_data$table)
# Reverse the prediction levels so the diagonal runs top-left to bottom-right.
plt$Prediction <- factor(plt$Prediction, levels=rev(levels(plt$Prediction)))
# Confusion-matrix heat map for the random forest on the test split.
rf_conf_mat <- ggplot(plt, aes(Prediction,Reference, fill= Freq)) +
geom_tile() + geom_text(aes(label=Freq)) +
scale_fill_gradient(low="white", high="#009194") +
labs(x = "Prediction",y = "Reference") +
# Derive axis labels from the factor levels instead of hard-coding them,
# so the plot stays correct if the class set ever changes.
scale_y_discrete(labels = levels(plt$Reference)) +
scale_x_discrete(labels = levels(plt$Prediction))
ggplotly(rf_conf_mat)
# Prepare the unseen-author test set: drop identifiers and orientation,
# and recode Sample to consecutive integers 1..n for the per-sample loop.
remove_col <- c("ID", "Acceleration.Timestamp", "Author", "Orientation.X", "Orientation.Y", "Orientation.Z")
motion_data_test <- motion_data_test[,!names(motion_data_test) %in% remove_col]
motion_data_test$Sample <- as.numeric(as.factor(motion_data_test$Sample))
unique(motion_data_test$Category)
## [1] Idle Running Dab Siu
## Levels: Dab Idle Running Siu
Sample IDs — Idle: 1 - 10, Dab: 11 - 20, Running: 21 - 30, Siu: 31 - 40
# Overview of the unseen-author test set (mosaic::inspect).
inspect(motion_data_test)
##
## categorical variables:
## name class levels n missing
## 1 Category factor 4 1420 0
## distribution
## 1 Running (47.6%), Dab (22.1%) ...
##
## quantitative variables:
## name class min Q1 median Q3 max
## 1 Sample numeric 1.00000 13.0000000 22.000000 27.0000000 40.00000
## 2 Acceleration.X numeric -19.24533 5.6348475 8.989855 10.3458525 74.95678
## 3 Acceleration.Y numeric -62.43217 -2.5990100 -1.087490 -0.0446425 10.99254
## 4 Acceleration.Z numeric -27.55201 -1.3236225 1.985545 4.8890625 40.44529
## 5 AngularVelocity.X numeric -7.90234 -0.5865550 -0.007295 0.5926225 12.85294
## 6 AngularVelocity.Y numeric -7.73286 -0.2825200 0.029700 0.3547175 10.01106
## 7 AngularVelocity.Z numeric -12.65705 -0.2874675 -0.006055 0.2679275 7.92185
## mean sd n missing
## 1 20.17323944 9.586790 1420 0
## 2 8.94612076 8.124970 1420 0
## 3 -2.02695663 5.929216 1420 0
## 4 1.79015740 5.362013 1420 0
## 5 0.03390892 1.698911 1420 0
## 6 0.05902677 1.120474 1420 0
## 7 -0.03541651 1.196526 1420 0
Dab is not recognized at all: 10 / 10 samples are misclassified
Idle: 10 / 10 Samples with at least 70 % correct
Running: 10 / 10 Samples with at least 60 % correct
Siu: 9 / 10 Samples with at least 50 % correct
In total we have an avg accuracy of 60 %
# Per-sample evaluation of the random forest on the fully unseen author.
# Each Sample id is one recorded motion; report the modal predicted class
# and the per-sample accuracy.
list_motion_data_unknown <- c()
total_accuracy <- 0
average_accuracy <- 0
for (i in seq_along(unique(motion_data_test$Sample))) {
  motion_data_unknown <- subset(motion_data_test, Sample == i)
  # True labels of this sample (subset above already filtered Sample == i,
  # so the original re-filter was redundant).
  ref <- motion_data_unknown$Category
  motion_data_unknown <- motion_data_unknown[, !names(motion_data_unknown) %in% c("Sample")]
  # Copy with labels blanked out so the model only sees the features.
  motion_data_no_labels <- data.frame(motion_data_unknown)
  motion_data_no_labels$Category <- ""
  set.seed(6)
  ## Generate predictions
  rf_dab_pred_new <- predict(object = model_rf_all, newdata = motion_data_no_labels)
  ## Accuracy for this sample
  accuracy <- mean(rf_dab_pred_new == motion_data_unknown$Category) * 100
  total_accuracy <- total_accuracy + accuracy
  motion_data_no_labels$Category <- rf_dab_pred_new
  # Modal predicted class. The original derived this from a confusion
  # matrix of the predictions against themselves; table() expresses the
  # intent directly and prints the identical label.
  pred_label <- names(which.max(table(rf_dab_pred_new)))
  print(paste("Reference: ", unique(ref), "Prediction: ", pred_label, "Accuracy: ", accuracy, sep = " "))
  list_motion_data_unknown <- append(list_motion_data_unknown, motion_data_no_labels)
}
## [1] "Reference: Idle Prediction: Dab Accuracy: 48.4848484848485"
## [1] "Reference: Idle Prediction: Idle Accuracy: 76.6666666666667"
## [1] "Reference: Idle Prediction: Dab Accuracy: 14.2857142857143"
## [1] "Reference: Idle Prediction: Idle Accuracy: 80"
## [1] "Reference: Idle Prediction: Idle Accuracy: 76.6666666666667"
## [1] "Reference: Idle Prediction: Idle Accuracy: 63.3333333333333"
## [1] "Reference: Idle Prediction: Dab Accuracy: 48.3870967741936"
## [1] "Reference: Idle Prediction: Idle Accuracy: 64.5161290322581"
## [1] "Reference: Idle Prediction: Dab Accuracy: 18.5185185185185"
## [1] "Reference: Idle Prediction: Idle Accuracy: 85.1851851851852"
## [1] "Reference: Dab Prediction: Dab Accuracy: 78.125"
## [1] "Reference: Dab Prediction: Dab Accuracy: 65.625"
## [1] "Reference: Dab Prediction: Dab Accuracy: 68.75"
## [1] "Reference: Dab Prediction: Dab Accuracy: 71.875"
## [1] "Reference: Dab Prediction: Dab Accuracy: 68.75"
## [1] "Reference: Dab Prediction: Dab Accuracy: 62.5"
## [1] "Reference: Dab Prediction: Dab Accuracy: 68.75"
## [1] "Reference: Dab Prediction: Dab Accuracy: 62.5"
## [1] "Reference: Dab Prediction: Siu Accuracy: 43.75"
## [1] "Reference: Dab Prediction: Dab Accuracy: 57.6923076923077"
## [1] "Reference: Running Prediction: Running Accuracy: 53.968253968254"
## [1] "Reference: Running Prediction: Running Accuracy: 77.6119402985075"
## [1] "Reference: Running Prediction: Running Accuracy: 73.5294117647059"
## [1] "Reference: Running Prediction: Running Accuracy: 78.3132530120482"
## [1] "Reference: Running Prediction: Running Accuracy: 73.3333333333333"
## [1] "Reference: Running Prediction: Running Accuracy: 75"
## [1] "Reference: Running Prediction: Running Accuracy: 76.4705882352941"
## [1] "Reference: Running Prediction: Running Accuracy: 71.875"
## [1] "Reference: Running Prediction: Running Accuracy: 70.9677419354839"
## [1] "Reference: Running Prediction: Running Accuracy: 60.3448275862069"
## [1] "Reference: Siu Prediction: Siu Accuracy: 50"
## [1] "Reference: Siu Prediction: Siu Accuracy: 60"
## [1] "Reference: Siu Prediction: Siu Accuracy: 88.2352941176471"
## [1] "Reference: Siu Prediction: Siu Accuracy: 93.3333333333333"
## [1] "Reference: Siu Prediction: Siu Accuracy: 87.5"
## [1] "Reference: Siu Prediction: Running Accuracy: 0"
## [1] "Reference: Siu Prediction: Siu Accuracy: 80"
## [1] "Reference: Siu Prediction: Siu Accuracy: 84.6153846153846"
## [1] "Reference: Siu Prediction: Siu Accuracy: 56.25"
## [1] "Reference: Siu Prediction: Siu Accuracy: 100"
# Mean per-sample accuracy across all unseen samples.
average_accuracy <- total_accuracy / length(unique(motion_data_test$Sample))
print(paste("AVG Accuracy: ", average_accuracy))
## [1] "AVG Accuracy: 65.8927457209973"
# k-nearest neighbours with the same 4-fold CV setup as the random forest.
set.seed(6)
# 6: 89.8 %
control_par <- trainControl(method = "cv", number=4)
model_knn <- train(Category~.,
data=train_data_all,
"knn",
trControl = control_par,
metric = "Accuracy"
)
model_knn
## k-Nearest Neighbors
##
## 4355 samples
## 6 predictor
## 4 classes: 'Dab', 'Idle', 'Running', 'Siu'
##
## No pre-processing
## Resampling: Cross-Validated (4 fold)
## Summary of sample sizes: 3267, 3267, 3265, 3266
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.8020649 0.7358098
## 7 0.7993080 0.7320936
## 9 0.8006850 0.7339331
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
KNN with cross validation 4 fold
# Cross-validated confusion matrix for the KNN model.
cm_train_data <- confusionMatrix(model_knn)
cm_train_data
## Cross-Validated (4 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction Dab Idle Running Siu
## Dab 18.5 0.8 2.0 3.0
## Idle 1.8 23.3 2.3 1.0
## Running 1.0 1.4 20.3 2.5
## Siu 1.5 0.8 1.7 18.0
##
## Accuracy (average) : 0.8021
set.seed(6)
## Generate predictions on the held-out test split
knn_all_pred_test <- predict(model_knn,test_data_all)
## Print the accuracy
accuracy <- mean(knn_all_pred_test == test_data_all$Category)*100
accuracy
## [1] 80.01842
# Detailed per-class statistics for KNN on the test split.
cm_test_data <- confusionMatrix(knn_all_pred_test, test_data_all$Category)
cm_test_data
## Confusion Matrix and Statistics
##
## Reference
## Prediction Dab Idle Running Siu
## Dab 200 7 22 22
## Idle 20 255 29 11
## Running 10 16 219 38
## Siu 18 9 15 195
##
## Overall Statistics
##
## Accuracy : 0.8002
## 95% CI : (0.7751, 0.8236)
## No Information Rate : 0.2643
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7331
##
## Mcnemar's Test P-Value : 0.0003278
##
## Statistics by Class:
##
## Class: Dab Class: Idle Class: Running Class: Siu
## Sensitivity 0.8065 0.8885 0.7684 0.7331
## Specificity 0.9391 0.9249 0.9201 0.9488
## Pos Pred Value 0.7968 0.8095 0.7739 0.8228
## Neg Pred Value 0.9425 0.9585 0.9178 0.9164
## Prevalence 0.2284 0.2643 0.2624 0.2449
## Detection Rate 0.1842 0.2348 0.2017 0.1796
## Detection Prevalence 0.2311 0.2901 0.2606 0.2182
## Balanced Accuracy 0.8728 0.9067 0.8443 0.8409
plt <- as.data.frame(cm_test_data$table)
# Reverse the prediction levels so the diagonal runs top-left to bottom-right.
plt$Prediction <- factor(plt$Prediction, levels=rev(levels(plt$Prediction)))
# Confusion-matrix heat map for KNN (variable renamed from rf_conf_mat:
# this section plots the KNN model, not the random forest).
knn_conf_mat <- ggplot(plt, aes(Prediction,Reference, fill= Freq)) +
geom_tile() + geom_text(aes(label=Freq)) +
scale_fill_gradient(low="white", high="#009194") +
labs(x = "Prediction",y = "Reference") +
# Axis labels derived from factor levels instead of hard-coded strings.
scale_y_discrete(labels = levels(plt$Reference)) +
scale_x_discrete(labels = levels(plt$Prediction))
ggplotly(knn_conf_mat)
# NOTE(review): this repeats the preparation done before the random-forest
# evaluation. The remove_col columns were already dropped there and Sample
# is already numeric 1..n, so these statements are effectively no-ops here.
remove_col <- c("ID", "Acceleration.Timestamp", "Author", "Orientation.X", "Orientation.Y", "Orientation.Z")
motion_data_test <- motion_data_test[,!names(motion_data_test) %in% remove_col]
motion_data_test$Sample <- as.numeric(as.factor(motion_data_test$Sample))
unique(motion_data_test$Category)
## [1] Idle Running Dab Siu
## Levels: Dab Idle Running Siu
Sample IDs — Idle: 1 - 10, Dab: 11 - 20, Running: 21 - 30, Siu: 31 - 40
# Summary of the (unchanged) unseen-author test set.
inspect(motion_data_test)
##
## categorical variables:
## name class levels n missing
## 1 Category factor 4 1420 0
## distribution
## 1 Running (47.6%), Dab (22.1%) ...
##
## quantitative variables:
## name class min Q1 median Q3 max
## 1 Sample numeric 1.00000 13.0000000 22.000000 27.0000000 40.00000
## 2 Acceleration.X numeric -19.24533 5.6348475 8.989855 10.3458525 74.95678
## 3 Acceleration.Y numeric -62.43217 -2.5990100 -1.087490 -0.0446425 10.99254
## 4 Acceleration.Z numeric -27.55201 -1.3236225 1.985545 4.8890625 40.44529
## 5 AngularVelocity.X numeric -7.90234 -0.5865550 -0.007295 0.5926225 12.85294
## 6 AngularVelocity.Y numeric -7.73286 -0.2825200 0.029700 0.3547175 10.01106
## 7 AngularVelocity.Z numeric -12.65705 -0.2874675 -0.006055 0.2679275 7.92185
## mean sd n missing
## 1 20.17323944 9.586790 1420 0
## 2 8.94612076 8.124970 1420 0
## 3 -2.02695663 5.929216 1420 0
## 4 1.79015740 5.362013 1420 0
## 5 0.03390892 1.698911 1420 0
## 6 0.05902677 1.120474 1420 0
## 7 -0.03541651 1.196526 1420 0
Dab is not recognized at all: 10 / 10 samples are misclassified
Idle: 10 / 10 Samples with at least 70 % correct
Running: 10 / 10 Samples with at least 60 % correct
Siu: 9 / 10 Samples with at least 50 % correct
In total we have an avg accuracy of 60 %
# Per-sample evaluation of the KNN model on the fully unseen author.
total_accuracy <- 0
average_accuracy <- 0
for (i in seq_along(unique(motion_data_test$Sample))) {
  motion_data_unknown <- subset(motion_data_test, Sample == i)
  # True labels of this sample (subset above already filtered Sample == i).
  ref <- motion_data_unknown$Category
  motion_data_unknown <- motion_data_unknown[, !names(motion_data_unknown) %in% c("Sample")]
  # Copy with labels blanked out so the model only sees the features.
  motion_data_no_labels <- data.frame(motion_data_unknown)
  motion_data_no_labels$Category <- ""
  set.seed(6)
  ## Generate predictions
  knn_pred_new <- predict(object = model_knn, newdata = motion_data_no_labels)
  ## Accuracy for this sample
  accuracy <- mean(knn_pred_new == motion_data_unknown$Category) * 100
  total_accuracy <- total_accuracy + accuracy
  motion_data_no_labels$Category <- knn_pred_new
  # Modal predicted class (replaces the original's confusion matrix of the
  # predictions against themselves; the printed label is identical).
  pred_label <- names(which.max(table(knn_pred_new)))
  print(paste("Reference: ", unique(ref), "Prediction: ", pred_label, "Accuracy: ", accuracy, sep = " "))
}
## [1] "Reference: Idle Prediction: Idle Accuracy: 66.6666666666667"
## [1] "Reference: Idle Prediction: Idle Accuracy: 93.3333333333333"
## [1] "Reference: Idle Prediction: Idle Accuracy: 57.1428571428571"
## [1] "Reference: Idle Prediction: Idle Accuracy: 96"
## [1] "Reference: Idle Prediction: Idle Accuracy: 86.6666666666667"
## [1] "Reference: Idle Prediction: Idle Accuracy: 83.3333333333333"
## [1] "Reference: Idle Prediction: Idle Accuracy: 90.3225806451613"
## [1] "Reference: Idle Prediction: Idle Accuracy: 90.3225806451613"
## [1] "Reference: Idle Prediction: Idle Accuracy: 74.0740740740741"
## [1] "Reference: Idle Prediction: Idle Accuracy: 100"
## [1] "Reference: Dab Prediction: Dab Accuracy: 68.75"
## [1] "Reference: Dab Prediction: Dab Accuracy: 50"
## [1] "Reference: Dab Prediction: Dab Accuracy: 46.875"
## [1] "Reference: Dab Prediction: Dab Accuracy: 50"
## [1] "Reference: Dab Prediction: Dab Accuracy: 40.625"
## [1] "Reference: Dab Prediction: Dab Accuracy: 46.875"
## [1] "Reference: Dab Prediction: Dab Accuracy: 53.125"
## [1] "Reference: Dab Prediction: Dab Accuracy: 50"
## [1] "Reference: Dab Prediction: Siu Accuracy: 25"
## [1] "Reference: Dab Prediction: Dab Accuracy: 65.3846153846154"
## [1] "Reference: Running Prediction: Running Accuracy: 63.4920634920635"
## [1] "Reference: Running Prediction: Running Accuracy: 74.6268656716418"
## [1] "Reference: Running Prediction: Running Accuracy: 77.9411764705882"
## [1] "Reference: Running Prediction: Running Accuracy: 73.4939759036145"
## [1] "Reference: Running Prediction: Running Accuracy: 78.6666666666667"
## [1] "Reference: Running Prediction: Running Accuracy: 75"
## [1] "Reference: Running Prediction: Running Accuracy: 75"
## [1] "Reference: Running Prediction: Running Accuracy: 64.0625"
## [1] "Reference: Running Prediction: Running Accuracy: 62.9032258064516"
## [1] "Reference: Running Prediction: Running Accuracy: 53.448275862069"
## [1] "Reference: Siu Prediction: Running Accuracy: 40"
## [1] "Reference: Siu Prediction: Siu Accuracy: 45"
## [1] "Reference: Siu Prediction: Siu Accuracy: 70.5882352941177"
## [1] "Reference: Siu Prediction: Siu Accuracy: 53.3333333333333"
## [1] "Reference: Siu Prediction: Siu Accuracy: 68.75"
## [1] "Reference: Siu Prediction: Siu Accuracy: 100"
## [1] "Reference: Siu Prediction: Siu Accuracy: 60"
## [1] "Reference: Siu Prediction: Siu Accuracy: 76.9230769230769"
## [1] "Reference: Siu Prediction: Siu Accuracy: 43.75"
## [1] "Reference: Siu Prediction: Siu Accuracy: 66.6666666666667"
# Mean per-sample accuracy for KNN on the unseen author.
average_accuracy <- total_accuracy / length(unique(motion_data_test$Sample))
average_accuracy
## [1] 66.45357
# CART decision tree (rpart) with the same 4-fold CV setup.
set.seed(6)
# 6: 89.8 %
control_par <- trainControl(method = "cv", number=4)
model_rpart <- train(Category~.,
data=train_data_all,
"rpart",
trControl = control_par,
metric = "Accuracy"
)
model_rpart
## CART
##
## 4355 samples
## 6 predictor
## 4 classes: 'Dab', 'Idle', 'Running', 'Siu'
##
## No pre-processing
## Resampling: Cross-Validated (4 fold)
## Summary of sample sizes: 3267, 3267, 3265, 3266
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.08143526 0.5161941 0.3482533
## 0.08642746 0.4877288 0.3098354
## 0.22745710 0.3048900 0.0572642
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.08143526.
# Basic plot of the fitted decision tree (rpart final model).
# branch = TRUE spelled out (was `T`, which is reassignable and discouraged).
plot(model_rpart$finalModel, branch = TRUE, margin = 0.1)
text(model_rpart$finalModel)
Rpart with cross validation 4 fold
# Cross-validated confusion matrix for the rpart model.
cm_train_data <- confusionMatrix(model_rpart)
cm_train_data
## Cross-Validated (4 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction Dab Idle Running Siu
## Dab 0.0 0.0 0.0 0.0
## Idle 10.9 24.4 10.6 4.7
## Running 3.1 0.8 10.1 2.7
## Siu 8.8 1.1 5.6 17.1
##
## Accuracy (average) : 0.5162
set.seed(6)
## Generate predictions on the held-out test split
rpart_all_pred_test <- predict(model_rpart,test_data_all)
## Print the accuracy
accuracy <- mean(rpart_all_pred_test == test_data_all$Category)*100
accuracy
## [1] 49.07919
# Detailed per-class statistics for rpart on the test split.
cm_test_data <- confusionMatrix(rpart_all_pred_test, test_data_all$Category)
cm_test_data
## Confusion Matrix and Statistics
##
## Reference
## Prediction Dab Idle Running Siu
## Dab 0 0 0 0
## Idle 111 272 145 65
## Running 19 2 71 11
## Siu 118 13 69 190
##
## Overall Statistics
##
## Accuracy : 0.4908
## 95% CI : (0.4607, 0.521)
## No Information Rate : 0.2643
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3145
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Dab Class: Idle Class: Running Class: Siu
## Sensitivity 0.0000 0.9477 0.24912 0.7143
## Specificity 1.0000 0.5982 0.96005 0.7561
## Pos Pred Value NaN 0.4587 0.68932 0.4872
## Neg Pred Value 0.7716 0.9696 0.78230 0.8908
## Prevalence 0.2284 0.2643 0.26243 0.2449
## Detection Rate 0.0000 0.2505 0.06538 0.1750
## Detection Prevalence 0.0000 0.5460 0.09484 0.3591
## Balanced Accuracy 0.5000 0.7730 0.60459 0.7352
plt <- as.data.frame(cm_test_data$table)
# Reverse the prediction levels so the diagonal runs top-left to bottom-right.
plt$Prediction <- factor(plt$Prediction, levels=rev(levels(plt$Prediction)))
# Confusion-matrix heat map for rpart (variable renamed from rf_conf_mat:
# this section plots the decision tree, not the random forest).
rpart_conf_mat <- ggplot(plt, aes(Prediction,Reference, fill= Freq)) +
geom_tile() + geom_text(aes(label=Freq)) +
scale_fill_gradient(low="white", high="#009194") +
labs(x = "Prediction",y = "Reference") +
# Axis labels derived from factor levels instead of hard-coded strings.
scale_y_discrete(labels = levels(plt$Reference)) +
scale_x_discrete(labels = levels(plt$Prediction))
ggplotly(rpart_conf_mat)
# NOTE(review): repeated preparation again — the columns were already
# dropped and Sample is already numeric, so this is effectively a no-op.
remove_col <- c("ID", "Acceleration.Timestamp", "Author", "Orientation.X", "Orientation.Y", "Orientation.Z")
motion_data_test <- motion_data_test[,!names(motion_data_test) %in% remove_col]
motion_data_test$Sample <- as.numeric(as.factor(motion_data_test$Sample))
unique(motion_data_test$Category)
## [1] Idle Running Dab Siu
## Levels: Dab Idle Running Siu
Sample IDs — Idle: 1 - 10, Dab: 11 - 20, Running: 21 - 30, Siu: 31 - 40
# Summary of the (unchanged) unseen-author test set.
inspect(motion_data_test)
##
## categorical variables:
## name class levels n missing
## 1 Category factor 4 1420 0
## distribution
## 1 Running (47.6%), Dab (22.1%) ...
##
## quantitative variables:
## name class min Q1 median Q3 max
## 1 Sample numeric 1.00000 13.0000000 22.000000 27.0000000 40.00000
## 2 Acceleration.X numeric -19.24533 5.6348475 8.989855 10.3458525 74.95678
## 3 Acceleration.Y numeric -62.43217 -2.5990100 -1.087490 -0.0446425 10.99254
## 4 Acceleration.Z numeric -27.55201 -1.3236225 1.985545 4.8890625 40.44529
## 5 AngularVelocity.X numeric -7.90234 -0.5865550 -0.007295 0.5926225 12.85294
## 6 AngularVelocity.Y numeric -7.73286 -0.2825200 0.029700 0.3547175 10.01106
## 7 AngularVelocity.Z numeric -12.65705 -0.2874675 -0.006055 0.2679275 7.92185
## mean sd n missing
## 1 20.17323944 9.586790 1420 0
## 2 8.94612076 8.124970 1420 0
## 3 -2.02695663 5.929216 1420 0
## 4 1.79015740 5.362013 1420 0
## 5 0.03390892 1.698911 1420 0
## 6 0.05902677 1.120474 1420 0
## 7 -0.03541651 1.196526 1420 0
Dab is not recognized at all: 10 / 10 samples are misclassified
Idle: 10 / 10 Samples with at least 70 % correct
Running: 10 / 10 Samples with at least 60 % correct
Siu: 9 / 10 Samples with at least 50 % correct
In total we have an avg accuracy of 60 %
# Per-sample evaluation of the rpart model on the fully unseen author.
total_accuracy <- 0
average_accuracy <- 0
for (i in seq_along(unique(motion_data_test$Sample))) {
  motion_data_unknown <- subset(motion_data_test, Sample == i)
  # True labels of this sample (subset above already filtered Sample == i).
  ref <- motion_data_unknown$Category
  motion_data_unknown <- motion_data_unknown[, !names(motion_data_unknown) %in% c("Sample")]
  # Copy with labels blanked out so the model only sees the features.
  motion_data_no_labels <- data.frame(motion_data_unknown)
  motion_data_no_labels$Category <- ""
  set.seed(6)
  ## Generate predictions
  rpart_pred_new <- predict(object = model_rpart, newdata = motion_data_no_labels)
  ## Accuracy for this sample
  accuracy <- mean(rpart_pred_new == motion_data_unknown$Category) * 100
  total_accuracy <- total_accuracy + accuracy
  motion_data_no_labels$Category <- rpart_pred_new
  # Modal predicted class (replaces the original's confusion matrix of the
  # predictions against themselves; the printed label is identical).
  pred_label <- names(which.max(table(rpart_pred_new)))
  print(paste("Reference: ", unique(ref), "Prediction: ", pred_label, "Accuracy: ", accuracy, sep = " "))
}
## [1] "Reference: Idle Prediction: Idle Accuracy: 75.7575757575758"
## [1] "Reference: Idle Prediction: Idle Accuracy: 100"
## [1] "Reference: Idle Prediction: Idle Accuracy: 71.4285714285714"
## [1] "Reference: Idle Prediction: Idle Accuracy: 100"
## [1] "Reference: Idle Prediction: Idle Accuracy: 96.6666666666667"
## [1] "Reference: Idle Prediction: Idle Accuracy: 96.6666666666667"
## [1] "Reference: Idle Prediction: Idle Accuracy: 90.3225806451613"
## [1] "Reference: Idle Prediction: Idle Accuracy: 100"
## [1] "Reference: Idle Prediction: Idle Accuracy: 100"
## [1] "Reference: Idle Prediction: Idle Accuracy: 100"
## [1] "Reference: Dab Prediction: Siu Accuracy: 0"
## [1] "Reference: Dab Prediction: Siu Accuracy: 0"
## [1] "Reference: Dab Prediction: Siu Accuracy: 0"
## [1] "Reference: Dab Prediction: Siu Accuracy: 0"
## [1] "Reference: Dab Prediction: Siu Accuracy: 0"
## [1] "Reference: Dab Prediction: Siu Accuracy: 0"
## [1] "Reference: Dab Prediction: Siu Accuracy: 0"
## [1] "Reference: Dab Prediction: Idle Accuracy: 0"
## [1] "Reference: Dab Prediction: Siu Accuracy: 0"
## [1] "Reference: Dab Prediction: Siu Accuracy: 0"
## [1] "Reference: Running Prediction: Siu Accuracy: 36.5079365079365"
## [1] "Reference: Running Prediction: Siu Accuracy: 19.4029850746269"
## [1] "Reference: Running Prediction: Siu Accuracy: 22.0588235294118"
## [1] "Reference: Running Prediction: Siu Accuracy: 26.5060240963855"
## [1] "Reference: Running Prediction: Siu Accuracy: 26.6666666666667"
## [1] "Reference: Running Prediction: Siu Accuracy: 26.4705882352941"
## [1] "Reference: Running Prediction: Siu Accuracy: 26.4705882352941"
## [1] "Reference: Running Prediction: Siu Accuracy: 32.8125"
## [1] "Reference: Running Prediction: Siu Accuracy: 30.6451612903226"
## [1] "Reference: Running Prediction: Siu Accuracy: 27.5862068965517"
## [1] "Reference: Siu Prediction: Siu Accuracy: 55"
## [1] "Reference: Siu Prediction: Siu Accuracy: 70"
## [1] "Reference: Siu Prediction: Siu Accuracy: 82.3529411764706"
## [1] "Reference: Siu Prediction: Siu Accuracy: 80"
## [1] "Reference: Siu Prediction: Siu Accuracy: 93.75"
## [1] "Reference: Siu Prediction: Siu Accuracy: 100"
## [1] "Reference: Siu Prediction: Siu Accuracy: 80"
## [1] "Reference: Siu Prediction: Siu Accuracy: 100"
## [1] "Reference: Siu Prediction: Siu Accuracy: 81.25"
## [1] "Reference: Siu Prediction: Siu Accuracy: 91.6666666666667"
# Mean per-sample accuracy for rpart on the unseen author.
average_accuracy <- total_accuracy / length(unique(motion_data_test$Sample))
average_accuracy
## [1] 50.99973
Good correlation between:
Orientation.X and Orientation.Y with 0.56
Orientation.Z and Orientation.Y with 0.48
Category and Orientation.Y with 0.43
Orientation.Z and Orientation.X with 0.54
Angular Velocity is quite independent compared to other variables, but we still included it, to determine the activities better.
Since everyone used another phone position, we removed the orientation, because it also didn’t really improve the model accuracy.
We also removed the magnetic field, because there were a lot of NA’s, and even when using only the data where we had no NA’s, the models didn’t really improve with it (about 40 % on unseen data).
In total we tried three different models: Random forest, KNN and Rpart (Decision Tree).
Accuracies:
Prediction with random forest: Train data: 81.56 %, Test data: 83.37 % Unseen data: 60.48 %
Prediction with knn: Train data: 77.29 %, Test data: 82.08 % Unseen data: 59.52 %
Prediction with rpart: Train data: 55.69 %, Test data: 52.07 % Unseen data: 48.50 %
Classification:
The problem is that the models do not recognize dab at all.
The reason for that is, because Dab is kinda included into Siu.
That’s why they also misclassified Dab as Siu.
So, we decided to add another activity called lunge and to train and test the models again.